In [54]:
import os
import subprocess
import donuts.deconv.utils as du
import donuts.deconv.ncx as dn
import nibabel as nib
import numpy as np
import numpy.random as npr
import scipy.optimize as spo
import numpy.linalg as nla
from operator import add
from pyspark.mllib.clustering import KMeans
import matplotlib.pyplot as plt
In [39]:
from pyspark.mllib.clustering import KMeans
In [2]:
# Download the b0-corrected coil volume from S3 into the local data directory.
# List-args form avoids an unnecessary shell (the command has no shell features).
os.chdir('/root/data')
subprocess.check_output(
    ['aws', 's3', 'cp',
     's3://rawpredator/8907_7/b0_corrected.nii.gz',
     '8907_7_b0_corrected.nii.gz'])
Out[2]:
'Completed 1 of 22 part(s) with 1 file(s) remaining\rCompleted 2 of 22 part(s) with 1 file(s) remaining\rCompleted 3 of 22 part(s) with 1 file(s) remaining\rCompleted 4 of 22 part(s) with 1 file(s) remaining\rCompleted 5 of 22 part(s) with 1 file(s) remaining\rCompleted 6 of 22 part(s) with 1 file(s) remaining\rCompleted 7 of 22 part(s) with 1 file(s) remaining\rCompleted 8 of 22 part(s) with 1 file(s) remaining\rCompleted 9 of 22 part(s) with 1 file(s) remaining\rCompleted 10 of 22 part(s) with 1 file(s) remaining\rCompleted 11 of 22 part(s) with 1 file(s) remaining\rCompleted 12 of 22 part(s) with 1 file(s) remaining\rCompleted 13 of 22 part(s) with 1 file(s) remaining\rCompleted 14 of 22 part(s) with 1 file(s) remaining\rCompleted 15 of 22 part(s) with 1 file(s) remaining\rCompleted 16 of 22 part(s) with 1 file(s) remaining\rCompleted 17 of 22 part(s) with 1 file(s) remaining\rCompleted 18 of 22 part(s) with 1 file(s) remaining\rCompleted 19 of 22 part(s) with 1 file(s) remaining\rCompleted 20 of 22 part(s) with 1 file(s) remaining\rCompleted 21 of 22 part(s) with 1 file(s) remaining\rCompleted 22 of 22 part(s) with 1 file(s) remaining\rdownload: s3://rawpredator/8907_7/b0_corrected.nii.gz to ./8907_7_b0_corrected.nii.gz\n'
In [42]:
# Load the 4-D coil image and drop the first two volumes along axis 3
# (presumably calibration/dummy scans — TODO confirm with acquisition notes).
coil_data1 = nib.load('/root/data/8907_7_b0_corrected.nii.gz').get_data()[:, :, :, 2:]
In [43]:
np.shape(coil_data1)
Out[43]:
(120, 120, 69, 44)
In [9]:
def standardize(x):
    """Z-score an array: subtract its mean, divide by its (population) std."""
    mu = np.mean(x)
    sigma = np.std(x)
    return (x - mu) / sigma

def sdot(a, b):
    """Matrix product of a and b with all singleton dimensions squeezed out."""
    product = np.dot(a, b)
    return np.squeeze(product)

def par(st):
    """Parse one whitespace-delimited text line into a 1-D float array.

    Inverse of depar; used to deserialize rows read back via sc.textFile.
    Uses split() (no argument) rather than split(' ') so that repeated,
    leading, or trailing whitespace does not produce empty tokens and a
    float('') ValueError.
    """
    return np.array([float(v) for v in str(st).split()])

def depar(x):
    """Render a numeric vector as a single space-separated string (inverse of par)."""
    pieces = [str(xx) for xx in x]
    return ' '.join(pieces)

def arr_red(x, y):
    """Reduce step: stack two row-blocks vertically into one 2-D array."""
    stacked = np.vstack((x, y))
    return stacked

def array_to_rdd(a, parts = 48, rm = False, fn = 0):
    """Ship a 2-D numpy array to Spark as an RDD of 1-D float arrays.

    Writes `a` as text, pushes it into HDFS, and reads it back with
    sc.textFile — relies on the global SparkContext `sc`.

    Parameters:
        a     -- 2-D array; one RDD record per row.
        parts -- minimum number of textFile partitions.
        rm    -- if True, delete the local temp file (the HDFS copy is kept).
        fn    -- filename to use; 0 (sentinel) means pick a random temp name.

    Side effects: changes the process working directory, writes a local
    file and an HDFS file, and forces materialization via count().
    """
    if fn == 0:
        fn = 'temp' + str(npr.randint(0, 999999)) + '.txt'
    os.chdir('/root/ephemeral-hdfs/bin')
    np.savetxt(fn, a, fmt = '%.9e')
    os.system('./hadoop fs -put ' + fn + ' ' + fn)
    # cache + count: materialize now so the temp file can be removed safely
    rawrdd = sc.textFile(fn, parts).map(par).cache()
    rawrdd.count()
    if rm:
        os.system('rm ' + fn)
    return rawrdd

def rdd_to_array(rawrdd, parts = 48, rm = False, fn = 0):
    """Collect an RDD of 1-D float arrays back into one 2-D numpy array.

    Saves the RDD as text in HDFS, merges it to a local file, and loads it
    with np.loadtxt.

    Parameters:
        rawrdd -- RDD whose records depar() can serialize.
        parts  -- NOTE(review): accepted but never used.
        rm     -- NOTE(review): accepted but never used; both the local and
                  HDFS copies are always removed (unlike array_to_rdd).
        fn     -- filename to use; 0 (sentinel) means pick a random temp name.

    Side effects: changes the process working directory; creates and then
    deletes local and HDFS files.
    """
    if fn == 0:
        fn = 'temp' + str(npr.randint(0, 999999)) + '.txt'
    os.chdir('/root/ephemeral-hdfs/bin')
    rawrdd.map(depar).saveAsTextFile(fn)
    os.system('./hadoop fs -getmerge ' + fn + ' ' + fn)
    a = np.loadtxt(fn)
    # cleanup
    os.system('rm ' + fn)
    os.system('./hadoop fs -rmr ' + fn)
    return a

def rank1_approx(a):
    """Best rank-1 approximation of a 2-D array (Frobenius norm), via thin SVD."""
    left, sing, right = nla.svd(a, full_matrices=False)
    return sing[0] * np.outer(left[:, 0], right[0, :])
In [44]:
def demean(x):
    """Center a vector by subtracting its mean."""
    mu = np.mean(x)
    return x - mu

coil_dm1 = np.apply_along_axis(demean, 3, coil_data1)
In [6]:
np.mean(coil_dm1[20, 20, 20, :])
Out[6]:
1.9760236e-08
In [7]:
np.shape(coil_dm1)
Out[7]:
(120, 120, 69, 46)
In [45]:
# Panel of demeaned coil images: one row per volume, one column per axial slice.
w_inds = range(44)
vmin_2 = -1.0
vmax_2 = 1.0
z_inds = [30, 40, 50, 60]
figsize_ = (3, 3)
figsize_2 = (len(z_inds) * figsize_[0], len(w_inds) * figsize_[1])
f_sv, axarr = plt.subplots(len(w_inds), len(z_inds), sharex=True, sharey=True, figsize = figsize_2)
for i, w_ind in enumerate(w_inds):
    for j, z_ind in enumerate(z_inds):
        axarr[i, j].imshow(coil_dm1[:, :, z_ind, w_ind], cmap='bone', vmin = vmin_2, vmax = vmax_2)
In [46]:
coil_mu = np.apply_along_axis(np.mean, 3, coil_data1)
In [47]:
plt.hist(coil_mu.ravel())
Out[47]:
(array([  5.42520000e+05,   2.68875000e+05,   1.24415000e+05,
          3.98150000e+04,   1.32650000e+04,   3.43600000e+03,
          9.51000000e+02,   2.35000000e+02,   8.00000000e+01,
          8.00000000e+00]),
 array([ -2.47574067,   0.40641725,   3.28857517,   6.17073309,
          9.05289102,  11.93504894,  14.81720686,  17.69936478,
         20.5815227 ,  23.46368062,  26.34583855]),
 <a list of 10 Patch objects>)
In [48]:
coil_min = np.apply_along_axis(min, 3, coil_data1)
In [49]:
# Keep voxels whose mean signal exceeds 1 and whose minimum stays positive,
# then eyeball the mask at eight axial slices (two rows of four).
thres_mask = np.logical_and((coil_mu > 1), (coil_min > 0))
temp_map = thres_mask
z_inds = [5, 15, 25, 35]
f_c1, axarr = plt.subplots(2, len(z_inds), sharex=True, sharey=True, figsize = (18, 8))
for col, z_ind in enumerate(z_inds):
    axarr[0, col].imshow(temp_map[:, :, z_ind], cmap='jet', vmin = 0, vmax = 1)
z_inds = [40, 45, 55, 65]
for col, z_ind in enumerate(z_inds):
    axarr[1, col].imshow(temp_map[:, :, z_ind], cmap='jet', vmin = 0, vmax = 1)
In [50]:
# (x, y, z) coordinates of voxels passing the mask, one row per voxel.
flat_idx = np.flatnonzero(thres_mask)
thres_coords = np.array(np.unravel_index(flat_idx, np.shape(thres_mask))).T
# Square-root transform of the masked voxels' signal
# (presumably to stabilize variance — TODO confirm intent).
thres_vox1 = np.sqrt(coil_data1[thres_coords[:, 0], thres_coords[:, 1], thres_coords[:, 2], :])
In [51]:
vox1_rdd = array_to_rdd(thres_vox1)
In [52]:
vox1_rdd.count()
Out[52]:
287963
In [55]:
vox1_rdd_f = vox1_rdd.filter(lambda x : np.var(x) > 0.01)
In [73]:
# Random split of the 44 volumes into train/test halves.
# NOTE(review): no RNG seed is set, so the split (and everything downstream)
# differs on every run.
perm = npr.permutation(44)
half = 22
tr_inds = perm[:half]
te_inds = perm[half:]
In [77]:
def trfilt(x):
    """Restrict a voxel's measurement vector to the training volume indices.

    Relies on the global tr_inds defined in the split cell above.
    """
    return x[tr_inds]
In [78]:
vox1_rdd_f.map(trfilt).first()
Out[78]:
array([ 1.13860953,  1.20869076,  1.20852578,  1.10540438,  0.95714295,
        1.08675909,  1.21515012,  1.19201946,  1.23757732,  1.11619246,
        0.85726202,  1.29897451,  1.11141551,  1.08068466,  1.17847407,
        1.23499882,  1.09175801,  1.3766861 ,  1.10741067,  1.10618067,
        1.20242417,  1.18167353])
In [79]:
model1 = KMeans.train(vox1_rdd_f.map(trfilt), 20)
In [86]:
def f01(x):
    """Key a full voxel vector by its k-means cluster id (predicted on training dims).

    Relies on the globals model1 and tr_inds.
    """
    return (model1.predict(x[tr_inds]), x)
In [87]:
vox1_rdd_fc = vox1_rdd_f.map(f01)
In [88]:
smp = vox1_rdd_fc.take(2)
In [64]:
def identity(x):
    """Return x unchanged (helper; appears unused below — combstack serves
    both the seq and comb roles in aggregateByKey)."""
    return x

def combstack(x, y):
    """Row-stack accumulator for aggregateByKey; None acts as the zero value."""
    return y if x is None else np.vstack([x, y])
In [89]:
vox1_clusters = vox1_rdd_fc.aggregateByKey(None, combstack, combstack).values()
In [102]:
x = vox1_clusters.first()
In [91]:
vox1_clusters.map(lambda x : np.shape(x)[0]).collect()
Out[91]:
[4954,
 198,
 3152,
 1304,
 5188,
 3786,
 3943,
 5134,
 129,
 4209,
 4871,
 4886,
 651,
 3438,
 2426,
 2853,
 2783,
 1893,
 6764,
 5320]
In [92]:
npr.randint(0, 5)
Out[92]:
0
In [141]:
def simpute(x):
    """Held-out squared error of rank-1 matrix completion on one cluster.

    For each row of the (n_voxels, n_volumes) cluster matrix, knock out one
    randomly chosen test-index entry, impute it by iterating a rank-1 SVD
    approximation, and return the total squared error between the held-out
    values and the imputed values.

    Relies on the globals te_inds (test volume indices) and rank1_approx.
    Returns 0 for an empty cluster and -1 if x is not 2-D (single-voxel
    clusters arrive as 1-D arrays from aggregateByKey).
    """
    n = np.shape(x)[0]
    if n == 0:
        return 0
    nits = 10  # fixed-point iterations; empirically enough to converge here
    # One knocked-out test column per row, drawn uniformly from te_inds.
    inds_ko = te_inds[npr.randint(0, len(te_inds), n)]
    try:
        x = np.array(x)  # work on a copy so the caller's data is not mutated
        rows = np.arange(n)
        ho = x[rows, inds_ko].copy()  # raises IndexError if x is 1-D
        x[rows, inds_ko] = 0
        for _ in range(nits):
            xt = rank1_approx(x)
            # Refresh only the knocked-out entries with the rank-1 estimate.
            x[rows, inds_ko] = xt[rows, inds_ko]
        est = x[rows, inds_ko]
        return np.sum((ho - est) ** 2)
    except IndexError:
        # x had the wrong number of dimensions (1-D single-voxel cluster).
        return -1
In [108]:
vox1_clusters.map(simpute).collect()
Out[108]:
[147.33397563244807,
 25.329930172466426,
 227.12205784067334,
 83.85840449344353,
 175.67827381368426,
 251.83261202408801,
 103.94600031274005,
 115.77666766459922,
 48.749858338103152,
 191.09180649904255,
 177.77529121348294,
 189.3333289481684,
 72.894896607170949,
 207.72043556959369,
 146.20361454404792,
 91.898939708073982,
 162.42159641843568,
 114.53793981422466,
 159.41740443713735,
 138.86834713349452]
In [109]:
# Full pipeline with k = 2 clusters; total held-out imputation error.
# NOTE(review): this cell is copy-pasted for several k values — it would be
# cleaner as a single function of k (as partially done in the sweep loop).
model1 = KMeans.train(vox1_rdd_f.map(trfilt), 2)
def f01(x):
    return (model1.predict(x[tr_inds]), x)
vox1_rdd_fc = vox1_rdd_f.map(f01)
vox1_clusters = vox1_rdd_fc.aggregateByKey(None, combstack, combstack).values()
vox1_clusters.map(simpute).reduce(add)
Out[109]:
2918.1640374934777
In [110]:
# Second k = 2 run; the error differs from the previous cell because the
# k-means initialization and knockout draws are random (no seed set).
model1 = KMeans.train(vox1_rdd_f.map(trfilt), 2)
def f01(x):
    return (model1.predict(x[tr_inds]), x)
vox1_rdd_fc = vox1_rdd_f.map(f01)
vox1_clusters = vox1_rdd_fc.aggregateByKey(None, combstack, combstack).values()
vox1_clusters.map(simpute).reduce(add)
Out[110]:
2877.8035600243147
In [128]:
# Same pipeline with k = 3 clusters.
model1 = KMeans.train(vox1_rdd_f.map(trfilt), 3)
def f01(x):
    return (model1.predict(x[tr_inds]), x)
vox1_rdd_fc = vox1_rdd_f.map(f01)
vox1_clusters = vox1_rdd_fc.aggregateByKey(None, combstack, combstack).values()
vox1_clusters.map(simpute).reduce(add)
Out[128]:
2882.7178459754514
In [144]:
# Same pipeline with k = 800 clusters — error keeps dropping with larger k.
model1 = KMeans.train(vox1_rdd_f.map(trfilt), 800)
def f01(x):
    return (model1.predict(x[tr_inds]), x)
vox1_rdd_fc = vox1_rdd_f.map(f01)
vox1_clusters = vox1_rdd_fc.aggregateByKey(None, combstack, combstack).values()
vox1_clusters.map(simpute).reduce(add)
Out[144]:
2076.213229168512
In []:
# Sweep k from 20 to 780 in steps of 20, recording total held-out error per k.
res = []
for k in np.arange(20, 800, 20):
    model1 = KMeans.train(vox1_rdd_f.map(trfilt), k)
    # Redefined each iteration so the closure captures the current model1.
    def f01(x):
        return (model1.predict(x[tr_inds]), x)
    vox1_rdd_fc = vox1_rdd_f.map(f01)
    vox1_clusters = vox1_rdd_fc.aggregateByKey(None, combstack, combstack).values()
    res.append(vox1_clusters.map(simpute).reduce(add))
In [156]:
np.arange(20, 80, 800)
Out[156]:
array([20])
In [158]:
plt.plot(np.arange(20, 800, 20)[:len(res)], res)
Out[158]:
[<matplotlib.lines.Line2D at 0x7fec46623e90>]
In [127]:
vox1_clusters.map(simpute).take(3)
Out[127]:
[13.344788761254831, 11.100885200602098, 24.273189844545826]
In [118]:
np.shape(thres_vox1)
Out[118]:
(287963, 44)
In [119]:
cvox1_rdd = array_to_rdd(np.hstack([thres_coords, thres_vox1]))
In []: